This report provides an exploratory data analysis (EDA) of the processed Heart Disease Training dataset.
# Load the processed training data. Character columns are converted to
# factors on read because stringsAsFactors = TRUE.
train <- read.csv(
  file = "../data/processed/train_scaled.csv",
  stringsAsFactors = TRUE
)
#' Print a quick structural overview of a data frame
#'
#' Writes, in this order: the dimensions and `str()` of `df`, its
#' `summary()`, the result of `check_missing(df)`, and the result of
#' `class_balance(df)`, each under its own heading.
#'
#' @param df Data frame to describe.
#' @param target_col Name of the target column. NOTE(review): this argument
#'   is never read in the body; `class_balance()` is called with `df` only.
#'   Confirm that helper locates the target column on its own.
#' @return Whatever the final `print()` call returns for the
#'   `class_balance(df)` result.
print_basic_info <- function(df, target_col) {
  # Shape and column types.
  cat("\nData overview\n")
  df_dims <- dim(df)
  print(df_dims)
  str(df)

  # Per-column summary statistics / level counts.
  cat("\n\nSummary:\n")
  df_summary <- summary(df)
  print(df_summary)

  # Helper defined outside this block.
  cat("\nMissing values:\n")
  missing_counts <- check_missing(df)
  print(missing_counts)

  # Helper defined outside this block.
  cat("\nTarget class balance:\n")
  balance <- class_balance(df)
  print(balance)
}
# Print the overview of the training data; the target column is HeartDisease.
print_basic_info(df = train, target_col = "HeartDisease")
##
## Data overview
## [1] 550 12
## 'data.frame': 550 obs. of 12 variables:
## $ Age : num 0.0757 0.6163 -1.7625 -0.8975 -1.33 ...
## $ Sex : Factor w/ 2 levels "F","M": 2 2 2 2 1 2 1 2 2 1 ...
## $ ChestPainType : Factor w/ 4 levels "ASY","ATA","NAP",..: 1 1 3 3 2 1 1 1 2 2 ...
## $ RestingBP : num -0.14 -0.563 -0.14 -0.14 -0.405 ...
## $ Cholesterol : num -1.8373 0.3152 -0.0451 0.3429 -0.1375 ...
## $ FastingBS : num 1.735 -0.575 -0.575 -0.575 -0.575 ...
## $ RestingECG : Factor w/ 3 levels "LVH","Normal",..: 2 2 2 2 2 1 3 2 2 2 ...
## $ MaxHR : num -1.113 -0.83 0.503 0.261 1.715 ...
## $ ExerciseAngina: Factor w/ 2 levels "N","Y": 2 2 1 1 1 2 2 1 1 1 ...
## $ Oldpeak : num 1.946 0.36 -0.853 -0.76 -0.853 ...
## $ ST_Slope : Factor w/ 3 levels "Down","Flat",..: 2 1 3 3 3 2 2 2 3 3 ...
## $ HeartDisease : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 2 2 2 1 1 ...
##
##
## Summary:
## Age Sex ChestPainType RestingBP Cholesterol
## Min. :-2.73561 F:126 ASY:289 Min. :-7.0182 Min. :-1.8373
## 1st Qu.:-0.68120 M:424 ATA:112 1st Qu.:-0.6692 1st Qu.:-0.2021
## Median : 0.07569 NAP:122 Median :-0.1402 Median : 0.2043
## Mean : 0.00000 TA : 27 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.72445 3rd Qu.: 0.3889 3rd Qu.: 0.6270
## Max. : 2.56261 Max. : 3.5634 Max. : 3.3730
## FastingBS RestingECG MaxHR ExerciseAngina
## Min. :-0.5754 LVH :113 Min. :-3.13256 N:333
## 1st Qu.:-0.5754 Normal:337 1st Qu.:-0.70872 Y:217
## Median :-0.5754 ST :100 Median : 0.09923
## Mean : 0.0000 Mean : 0.00000
## 3rd Qu.:-0.5754 3rd Qu.: 0.73549
## Max. : 1.7347 Max. : 2.60387
## Oldpeak ST_Slope HeartDisease
## Min. :-2.2531 Down: 37 No :248
## 1st Qu.:-0.8533 Flat:282 Yes:302
## Median :-0.2467 Up :231
## Mean : 0.0000
## 3rd Qu.: 0.5465
## Max. : 4.9325
##
## Missing values:
## Age Sex ChestPainType RestingBP Cholesterol
## 0 0 0 0 0
## FastingBS RestingECG MaxHR ExerciseAngina Oldpeak
## 0 0 0 0 0
## ST_Slope HeartDisease
## 0 0
##
## Target class balance:
##
## No Yes
## 248 302
# Columns plotted as continuous variables in the sections below.
# NOTE(review): FastingBS shows only two values in the str() preview and an
# identical min/quartiles in the summary output, so it may be a scaled binary
# flag; confirm whether it belongs with the categorical columns.
numeric_cols <- c(
  "Age",
  "RestingBP",
  "Cholesterol",
  "FastingBS",
  "MaxHR",
  "Oldpeak"
)

# Factor columns plotted as categorical variables in the sections below.
cat_cols <- c(
  "Sex",
  "ChestPainType",
  "RestingECG",
  "ExerciseAngina",
  "ST_Slope"
)
# Share of each HeartDisease level, plus a "<level> (<percent>)" label
# for every slice.
pie_data <- train %>%
  count(HeartDisease) %>%
  mutate(prop = n / sum(n)) %>%
  mutate(lbl = paste0(HeartDisease, " (", scales::percent(prop), ")"))

# A single stacked column wrapped onto polar coordinates renders as a pie.
ggplot(data = pie_data, mapping = aes(x = "", y = prop, fill = HeartDisease)) +
  geom_col(color = "white", width = 1) +
  geom_text(
    mapping = aes(label = lbl),
    position = position_stack(vjust = 0.5)  # centre each label in its slice
  ) +
  coord_polar(theta = "y") +
  labs(title = "HeartDisease Class Distribution (Pie Chart)") +
  theme_void()
# Histogram of every numeric column.
# The column name is held in a string, so it is looked up through the
# `.data` pronoun; this replaces the deprecated `aes_string()`.
for (col in numeric_cols) {
  # print() is required: ggplot objects are not auto-printed inside a loop.
  print(
    ggplot(train, aes(x = .data[[col]])) +
      geom_histogram(bins = 30, fill = "skyblue", color = "black") +
      theme_minimal() +
      # Set the x label explicitly so the axis shows the column name.
      labs(title = paste("Distribution of", col), x = col)
  )
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Bar chart of level counts for every categorical column.
# The column name is held in a string, so it is looked up through the
# `.data` pronoun; this replaces the deprecated `aes_string()`.
for (col in cat_cols) {
  # print() is required: ggplot objects are not auto-printed inside a loop.
  print(
    ggplot(train, aes(x = .data[[col]])) +
      geom_bar(fill = "orange", color = "black") +
      theme_minimal() +
      # Set the x label explicitly so the axis shows the column name.
      labs(title = paste("Distribution of", col), x = col)
  )
}
# Box plot of every numeric column, split and filled by HeartDisease.
# The varying column is looked up through the `.data` pronoun; this replaces
# the deprecated `aes_string()`. HeartDisease is fixed, so it is mapped
# directly.
for (col in numeric_cols) {
  # print() is required: ggplot objects are not auto-printed inside a loop.
  print(
    ggplot(
      train,
      aes(x = HeartDisease, y = .data[[col]], fill = HeartDisease)
    ) +
      geom_boxplot() +
      theme_minimal() +
      # Set the y label explicitly so the axis shows the column name.
      labs(title = paste(col, "by HeartDisease"), y = col)
  )
}
# For every categorical column, show the HeartDisease split within each
# level as a proportion (position = "fill" scales each bar to height 1).
# The varying column is looked up through the `.data` pronoun; this replaces
# the deprecated `aes_string()`.
for (col in cat_cols) {
  # print() is required: ggplot objects are not auto-printed inside a loop.
  print(
    ggplot(train, aes(x = .data[[col]], fill = HeartDisease)) +
      geom_bar(position = "fill") +
      theme_minimal() +
      labs(
        title = paste(col, "vs HeartDisease (Proportion)"),
        # Set the x label explicitly so the axis shows the column name.
        x = col,
        y = "Proportion"
      )
  )
}
# Pairwise correlations between the numeric columns (cor() defaults).
numeric_data <- train[numeric_cols]
cor_mat <- cor(numeric_data)

# Coloured upper-triangle correlation plot with the coefficients overlaid.
corrplot(
  cor_mat,
  type = "upper",
  method = "color",
  addCoef.col = "black",  # coefficient text colour
  number.cex = 0.7,       # coefficient text size
  tl.col = "black",       # variable label colour
  tl.cex = 0.8            # variable label size
)
# Columns plotted against Age, one scatter plot each, coloured by
# HeartDisease.
key_pairs <- c("RestingBP", "Cholesterol", "MaxHR", "Oldpeak")

# The varying y column is looked up through the `.data` pronoun; this
# replaces the deprecated `aes_string()`. Age and HeartDisease are fixed, so
# they are mapped directly.
for (col in key_pairs) {
  # print() is required: ggplot objects are not auto-printed inside a loop.
  print(
    ggplot(train, aes(x = Age, y = .data[[col]], color = HeartDisease)) +
      geom_point(alpha = 0.6, size = 2) +
      theme_minimal() +
      labs(
        title = paste("Scatter Plot:", "Age vs", col),
        x = "Age",
        y = col
      )
  )
}
# Age against MaxHR, coloured by HeartDisease, with a loess smoother and its
# confidence band drawn per colour group.
ggplot(
  data = train,
  mapping = aes(x = Age, y = MaxHR, color = HeartDisease)
) +
  geom_point(alpha = 0.5) +
  geom_smooth(se = TRUE, method = "loess") +
  labs(title = "Age vs MaxHR with Trend Line") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# MaxHR against Oldpeak, coloured by HeartDisease, one panel per Sex level.
ggplot(
  data = train,
  mapping = aes(x = MaxHR, y = Oldpeak, color = HeartDisease)
) +
  geom_point(alpha = 0.6) +
  facet_wrap(facets = ~ Sex) +
  labs(title = "MaxHR vs Oldpeak (Faceted by Sex)") +
  theme_minimal()
# Pairs plot of the numeric columns together with the target, coloured by
# HeartDisease.
ggpairs(
  data = train[, c(numeric_cols, "HeartDisease")],
  mapping = aes(color = HeartDisease, alpha = 0.5)
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.